# Load the raw survey export and give its columns short, code-friendly names.
# Names are assigned by position, so the vector below must follow the CSV's
# column order.
health <- setNames(
  read.csv("Student Mental health.csv"),
  c(
    'Timestamp', 'Gender', 'Age', 'Course', 'Year', 'CGPA',
    'Married', 'Depression', 'Anxiety', 'Panic_Attack', 'Treatment'
  )
)
# Flag every column that contains at least one missing value.
# vapply() visits the columns directly, so the data frame is not first coerced
# to a character matrix (which apply() would do), and the result is guaranteed
# to be a named logical vector.
indx <- vapply(health, function(x) any(is.na(x)), logical(1))
indx
## Timestamp Gender Age Course Year CGPA
## FALSE FALSE TRUE FALSE FALSE FALSE
## Married Depression Anxiety Panic_Attack Treatment
## FALSE FALSE FALSE FALSE FALSE
# There is missing data in our Age column
which(is.na(health$Age))
## [1] 44
# Observation 44 is the only individual with a missing age, so we replace it
# with the median age of the group; the median keeps the value clean rather
# than introducing a long decimal as the mean would.
# The replacement is applied through an is.na() mask instead of a hard-coded
# row number, so it still hits the right row(s) if the data file changes.
health$Age[is.na(health$Age)] <- median(health$Age, na.rm = TRUE)
# Tally respondents by gender, then express each tally as a share of the
# whole sample (a proportion rounded to 4 decimal places).
Health_SummaryStat <- health %>%
  group_by(Gender) %>%
  summarise(count = n()) %>%
  mutate(percentage = round(count / nrow(health), digits = 4))
Health_SummaryStat
## # A tibble: 2 × 3
## Gender count percentage
## <chr> <int> <dbl>
## 1 Female 75 0.743
## 2 Male 26 0.257
# Slice colours, reused by the pie charts in this script.
colors <- c('rgb(211,94,96)', 'rgb(114,147,203)')

# Pie chart of the gender split: slices are sized by share of the sample,
# labelled with the gender and its percentage, and show the raw count on hover.
Gender_PieChart <- plot_ly(
  data = Health_SummaryStat,
  labels = ~Gender,
  values = ~percentage,
  type = 'pie',
  sort = FALSE,                      # keep slices in the order of the data
  textposition = 'inside',
  textinfo = 'label+percent',
  insidetextfont = list(color = 'White'),
  hoverinfo = 'text',
  text = ~count,
  marker = list(
    colors = colors,
    line = list(color = 'Black', width = 1)
  ),
  showlegend = TRUE
) %>%
  layout(title = 'Pie Chart of Gender')
Gender_PieChart
74.3% of observations were female, compared to 25.7% male.
# Tally respondents by depression status and express each tally as a share of
# the whole sample (a proportion rounded to 4 decimal places).
Health_SummaryStat2 <- health %>%
  group_by(Depression) %>%
  summarise(count = n()) %>%
  mutate(percentage = round(count / nrow(health), digits = 4))

# Pie chart of depression status, styled the same way as the gender pie:
# label + percentage inside each slice, raw count on hover.
Depression_PieChart <- plot_ly(
  data = Health_SummaryStat2,
  labels = ~Depression,
  values = ~percentage,
  type = 'pie',
  sort = FALSE,                      # keep slices in the order of the data
  textposition = 'inside',
  textinfo = 'label+percent',
  insidetextfont = list(color = 'White'),
  hoverinfo = 'text',
  text = ~count,
  marker = list(
    colors = colors,
    line = list(color = 'Black', width = 1)
  ),
  showlegend = TRUE
)
# The title is added only to the displayed copy; Depression_PieChart itself
# is left untitled.
layout(Depression_PieChart, title = 'Pie Chart of Depression')
34.6% of the sample had depression, compared to an average rate of about 5.0% amongst adults in the general population.
# Depression rate within each gender, drawn as one stacked bar per gender.
health %>%
  count(Gender, Depression, sort = FALSE) %>%
  # Proportions are computed within each gender, so they sum to 1 per gender.
  group_by(Gender) %>%
  mutate(prop = round(n / sum(n), digits = 4)) %>%
  ungroup() %>%
  plot_ly(x = ~Gender, y = ~prop, color = ~Depression, type = "bar",
          text = ~paste(Gender, prop * 100, '%'),
          textposition = 'outside') %>%
  # plotly's barmode values are lowercase ('stack', 'group', ...); the previous
  # 'Stacked' is not a recognised value, so the bars were not actually stacked.
  layout(barmode = 'stack',
         title = 'Barplot of Depression amongst Genders')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
On average, females had a higher rate of depression than males (38.67% vs 23.08%).
# Strip stray whitespace before building the factor: the raw column contains
# both "3.50 - 4.00" and "3.50 - 4.00 " (trailing space), which would otherwise
# become two separate levels and split the top CGPA band in every summary.
health$CGPA <- as.factor(trimws(health$CGPA))
levels(health$CGPA)
## [1] "0 - 1.99" "2.00 - 2.49" "2.50 - 2.99" "3.00 - 3.49" "3.50 - 4.00"
# Levels are in ascending CGPA order (the default alphabetical sort happens to
# match the numeric order of the bands).
# Bar chart of how many respondents fall into each CGPA band, with the count
# printed above each bar.
health %>%
  group_by(CGPA) %>%
  summarize(count = n()) %>%
  plot_ly(x = ~CGPA, y = ~count, type = 'bar',
          text = ~count,
          textposition = 'outside',
          marker = list(color = 'rgb(158,202,225)',
                        line = list(color = 'black',
                                    width = 1.0))) %>%
  # Title spelling corrected (was 'Distibution').
  layout(title = 'Distribution of CGPA')
# Depression rate within each CGPA band, drawn as side-by-side bars.
health %>%
  count(CGPA, Depression, sort = FALSE) %>%
  # Group by CGPA so each proportion is the share *within that CGPA band*
  # (a per-band depression rate). Without the grouping, n / sum(n) is a share
  # of the whole sample and mostly reflects how large each band is.
  group_by(CGPA) %>%
  mutate(proportion = round(n / sum(n), digits = 4)) %>%
  ungroup() %>%
  plot_ly(x = ~CGPA, y = ~proportion, color = ~Depression, type = 'bar') %>%
  # plotly's barmode values are lowercase; 'Group' is not a recognised value.
  layout(barmode = 'group',
         title = 'Barplot of Depression vs CGPA')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
We aren’t able to see a clear pattern between CGPA and depression, but we do notice a non-monotonic increase in depression rate as CGPA increases. This might be due to inadequate group sizes; if we had been given more precise CGPA values, we would have been able to get a clearer picture. With this data set, however, we do notice that CGPA has a slight positive association with depression rate.
# List the courses with more than two respondents, largest first.
health %>%
  group_by(Course) %>%
  summarise(count = n()) %>%
  filter(count > 2) %>%
  arrange(desc(count))
## # A tibble: 5 × 2
## Course count
## <chr> <int>
## 1 BCS 18
## 2 Engineering 17
## 3 BIT 10
## 4 Biomedical science 4
## 5 KOE 4
## Let's look at the courses BCS, Engineering, BIT, Biomedical science, and KOE
# Depression counts for the five most common courses, one stacked bar per
# course.
health %>%
  # Exact matching keeps this to the same five Course values that appear in
  # the count table; an unanchored regex would also pull in any other course
  # whose name merely contains one of these strings.
  filter(Course %in% c('BCS', 'Engineering', 'BIT',
                       'Biomedical science', 'KOE')) %>%
  count(Course, Depression, sort = TRUE) %>%
  group_by(Course) %>%
  # prop (share within each course) is kept alongside the counts; the chart
  # itself plots the raw counts n.
  mutate(prop = round(n / sum(n), digits = 4)) %>%
  plot_ly(x = ~Course, y = ~n, color = ~Depression, type = "bar",
          text = ~paste(Course, n),
          textposition = 'outside') %>%
  # plotly's barmode values are lowercase ('stack', 'group', ...); the previous
  # 'Stacked' is not a recognised value, so the bars were not actually stacked.
  layout(barmode = 'stack',
         title = 'Barplot of Depression amongst the top 5 Courses')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
Engineering students seem to have the most cases of depression and, surprisingly, none came from the Biomedical science group. However, the group sizes are too small to draw any conclusive associations.